From c60a806bc21faef69d5fab9d13241e6dcc70186b Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Mon, 7 Nov 2005 18:14:45 +0100 Subject: [PATCH] Fix pagetable pinning logic for xen/i386 kernels. The pin flag is now associated with the pgd rather than the mm -- this avoids a race where a pgd is allocated from the pgd_cache but, before it gets associated with an mm, the kernel suspends itself. At this point the kernel mappings will not get rewritten when the kernel is resumed, and the system will fail. A further advantage is that the code is slightly simpler and less invasive (no changes to mm_context for example). Signed-off-by: Keir Fraser --- .../arch/xen/i386/kernel/ldt.c | 13 +- linux-2.6-xen-sparse/arch/xen/i386/mm/init.c | 3 +- .../arch/xen/i386/mm/pgtable.c | 146 +++++++++--------- linux-2.6-xen-sparse/arch/xen/kernel/reboot.c | 2 +- .../include/asm-xen/asm-i386/mmu.h | 5 - .../include/asm-xen/asm-i386/mmu_context.h | 2 +- .../include/asm-xen/asm-i386/pgalloc.h | 5 +- 7 files changed, 85 insertions(+), 91 deletions(-) diff --git a/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c b/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c index 905dcd3928..5ef1ed10fc 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c @@ -18,7 +18,6 @@ #include #include #include -#include #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) @@ -101,19 +100,14 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) struct mm_struct * old_mm; int retval = 0; - memset(&mm->context, 0, sizeof(mm->context)); init_MUTEX(&mm->context.sem); + mm->context.size = 0; old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); retval = copy_ldt(&mm->context, &old_mm->context); up(&old_mm->context.sem); } - if (retval == 0) { - spin_lock(&mm_unpinned_lock); - list_add(&mm->context.unpinned, &mm_unpinned); - spin_unlock(&mm_unpinned_lock); - } return retval; } @@ -134,11 +128,6 @@ void destroy_context(struct mm_struct *mm) kfree(mm->context.ldt); mm->context.size = 0; } - if (!mm->context.pinned) { - spin_lock(&mm_unpinned_lock); - list_del(&mm->context.unpinned); - spin_unlock(&mm_unpinned_lock); - } } static int read_ldt(void __user * ptr, unsigned long bytecount) diff --git a/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c b/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c index b5f7005012..bed3831a23 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c @@ -376,7 +376,6 @@ static void __init pagetable_init (void) __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; } - init_mm.context.pinned = 1; kernel_physical_mapping_init(pgd_base); remap_numa_kva(); @@ -689,6 +688,8 @@ void __init mem_init(void) #ifndef CONFIG_SMP zap_low_mappings(); #endif + + set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags); } kmem_cache_t *pgd_cache; diff --git a/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c b/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c index 503d48842a..3af8e92144 100644 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c @@ -27,6 +27,9 @@ #include #include +static void __pgd_pin(pgd_t *pgd); +static void __pgd_unpin(pgd_t *pgd); + void show_mem(void) { int total = 0, reserved = 0; @@ -299,6 +302,8 @@ void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ + BUG_ON(test_bit(PG_pinned, &virt_to_page(pgd)->flags)); + if (HAVE_SHARED_KERNEL_PMD) return; @@ -312,6 +317,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) int i = 0; pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + BUG_ON(test_bit(PG_pinned, &virt_to_page(pgd)->flags)); + if (PTRS_PER_PMD == 1 || !pgd) return pgd; @@ -351,15 +358,9 @@ out_oom: void pgd_free(pgd_t *pgd) { int i; - pte_t *ptep = virt_to_ptep(pgd); - if (!pte_write(*ptep)) { - xen_pgd_unpin(__pa(pgd)); - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)pgd, - pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), - 0)); - } + if (test_bit(PG_pinned, &virt_to_page(pgd)->flags)) + __pgd_unpin(pgd); /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) { @@ -441,10 +442,7 @@ void make_pages_writable(void *va, unsigned int nr) } #endif /* CONFIG_XEN_SHADOW_MODE */ -LIST_HEAD(mm_unpinned); -DEFINE_SPINLOCK(mm_unpinned_lock); - -static inline void mm_walk_set_prot(void *pt, pgprot_t flags) +static inline void pgd_walk_set_prot(void *pt, pgprot_t flags) { struct page *page = virt_to_page(pt); unsigned long pfn = page_to_pfn(page); @@ -456,103 +454,111 @@ static inline void mm_walk_set_prot(void *pt, pgprot_t flags) pfn_pte(pfn, flags), 0)); } -static void mm_walk(struct mm_struct *mm, pgprot_t flags) +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int g,u,m; + pgd_t *pgd = pgd_base; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g, u, m; - pgd = mm->pgd; for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { if (pgd_none(*pgd)) continue; pud = pud_offset(pgd, 0); if (PTRS_PER_PUD > 1) /* not folded */ - mm_walk_set_prot(pud,flags); + pgd_walk_set_prot(pud,flags); for (u = 0; u < PTRS_PER_PUD; u++, pud++) { if (pud_none(*pud)) continue; pmd = pmd_offset(pud, 0); if (PTRS_PER_PMD > 1) /* not folded */ - mm_walk_set_prot(pmd,flags); + pgd_walk_set_prot(pmd,flags); for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { if (pmd_none(*pmd)) continue; pte = pte_offset_kernel(pmd,0); - mm_walk_set_prot(pte,flags); + pgd_walk_set_prot(pte,flags); } } } + + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH)); +} + +static void __pgd_pin(pgd_t *pgd) +{ + pgd_walk(pgd, PAGE_KERNEL_RO); + xen_pgd_pin(__pa(pgd)); + set_bit(PG_pinned, &virt_to_page(pgd)->flags); +} + +static void __pgd_unpin(pgd_t *pgd) +{ + xen_pgd_unpin(__pa(pgd)); + pgd_walk(pgd, PAGE_KERNEL); + clear_bit(PG_pinned, &virt_to_page(pgd)->flags); } void mm_pin(struct mm_struct *mm) { - spin_lock(&mm->page_table_lock); - - mm_walk(mm, PAGE_KERNEL_RO); - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)mm->pgd, - pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), - UVMF_TLB_FLUSH)); - xen_pgd_pin(__pa(mm->pgd)); - mm->context.pinned = 1; - spin_lock(&mm_unpinned_lock); - list_del(&mm->context.unpinned); - spin_unlock(&mm_unpinned_lock); - - spin_unlock(&mm->page_table_lock); + spin_lock(&mm->page_table_lock); + __pgd_pin(mm->pgd); + spin_unlock(&mm->page_table_lock); } void mm_unpin(struct mm_struct *mm) { - spin_lock(&mm->page_table_lock); - - xen_pgd_unpin(__pa(mm->pgd)); - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)mm->pgd, - pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0)); - mm_walk(mm, PAGE_KERNEL); - xen_tlb_flush(); - mm->context.pinned = 0; - spin_lock(&mm_unpinned_lock); - list_add(&mm->context.unpinned, &mm_unpinned); - spin_unlock(&mm_unpinned_lock); - - spin_unlock(&mm->page_table_lock); + spin_lock(&mm->page_table_lock); + __pgd_unpin(mm->pgd); + spin_unlock(&mm->page_table_lock); } void mm_pin_all(void) { - while (!list_empty(&mm_unpinned)) - mm_pin(list_entry(mm_unpinned.next, struct mm_struct, - context.unpinned)); + struct page *page; + for (page = pgd_list; page; page = (struct page *)page->index) { + if (!test_bit(PG_pinned, &page->flags)) + __pgd_pin((pgd_t *)page_address(page)); + } } void _arch_exit_mmap(struct mm_struct *mm) { - struct task_struct *tsk = current; + struct task_struct *tsk = current; - task_lock(tsk); + task_lock(tsk); - /* - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() - * *much* faster this way, as no tlb flushes means bigger wrpt batches. - */ - if ( tsk->active_mm == mm ) - { - tsk->active_mm = &init_mm; - atomic_inc(&init_mm.mm_count); + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. + */ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); - switch_mm(mm, &init_mm, tsk); + switch_mm(mm, &init_mm, tsk); - atomic_dec(&mm->mm_count); - BUG_ON(atomic_read(&mm->mm_count) == 0); - } + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } - task_unlock(tsk); + task_unlock(tsk); - if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) ) - mm_unpin(mm); + if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) && + (atomic_read(&mm->mm_count) == 1)) + mm_unpin(mm); } + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff --git a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c index d017cf9620..89c2005450 100644 --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c @@ -129,8 +129,8 @@ static int __do_suspend(void *ignore) preempt_disable(); #ifdef __i386__ - mm_pin_all(); kmem_cache_shrink(pgd_cache); + mm_pin_all(); #endif __cli(); diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h index b628b46f3b..32987b80b1 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h @@ -12,13 +12,8 @@ typedef struct { int size; struct semaphore sem; void *ldt; - unsigned pinned:1; - struct list_head unpinned; } mm_context_t; -extern struct list_head mm_unpinned; -extern spinlock_t mm_unpinned_lock; - /* mm/memory.c:exit_mmap hook */ extern void _arch_exit_mmap(struct mm_struct *mm); #define arch_exit_mmap(_mm) _arch_exit_mmap(_mm) diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h index c5567bc9b3..129f79aba0 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h @@ -53,7 +53,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mmuext_op _op[2], *op = _op; if (likely(prev != next)) { - if (!next->context.pinned) + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) mm_pin(next); /* stop flush ipis for the previous mm */ diff --git a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h index f559b93a5f..98e802babc 100644 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h @@ -7,12 +7,15 @@ #include /* for struct page */ #include /* for phys_to_virt and page_to_pseudophys */ +/* Is this pagetable pinned? */ +#define PG_pinned PG_arch_1 + #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) #define pmd_populate(mm, pmd, pte) \ do { \ - if (unlikely((mm)->context.pinned)) { \ + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \ if (!PageHighMem(pte)) \ BUG_ON(HYPERVISOR_update_va_mapping( \ (unsigned long)__va(page_to_pfn(pte)<